The following is an EDA used for findings pertinent to the Fall 2022 project symposium. @author: Atharva Shirke

Import packages (pathlib, pandas, numpy, sklearn, matplotlib, geopandas)

In [ ]:
import pyproj
import shapely
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from pathlib import Path
import folium
import mapclassify
import geopandas
import rtree

espg4326 = pyproj.CRS('+proj=longlat +datum=WGS84 +no_defs +type=crs')
In [ ]:
dataPath = Path("../data/")
In [ ]:
def makeDataFrame(relative_path, geojson=False):
    """
    Given the relative path of a csv in the data
    directory, return a pandas dataframe.
    """
    targetPath = dataPath / Path(relative_path)
    if geojson:
        return geopandas.read_file(targetPath)
    return pd.read_csv(targetPath)

Using this function, let's start by loading up the zoning information we have for the city of Berkeley.

In [ ]:
zones = "raw/zoning.geojson"
zones = makeDataFrame(zones, geojson=True)
zones
Out[ ]:
shape_st_2 area shape_stle blocks_id perimeter lasteditor height lastupdate objectid genplan zoneclass new_block shape_st_1 shape_star zonedesc baseelev zone geometry
0 897.965039154 256240.562 897.96503916 3 2942.173 None 0 None 1 LDR R-1H 062 2949 23871.2937675 23871.2939453 None 0 R MULTIPOLYGON (((-122.27356 37.90460, -122.2735...
1 452.362820501 111175 452.3628205 5 1484.545 None 0 None 2 LDR R-1H 063 2951 10322.7637992 10322.763916 None 0 R MULTIPOLYGON (((-122.26997 37.90433, -122.2699...
2 807.498810905 170184.812 807.49881091 6 2637.073 None 0 None 3 LDR R-1H 062 2942 16129.8272618 16129.8276367 None 0 R MULTIPOLYGON (((-122.27484 37.90388, -122.2745...
3 1011.43398248 307217.938 1011.43398248 7 3510.71 None 0 None 4 LDR R-1H 063 3100 28537.9496481 28537.951416 None 0 R MULTIPOLYGON (((-122.26898 37.90397, -122.2684...
4 433.015174892 101674.438 433.01517489 8 1421.052 None 0 None 5 LDR R-1H 062 2945 9440.55918856 9440.55957031 None 0 R MULTIPOLYGON (((-122.27198 37.90388, -122.2719...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1855 338.715577619 157345.375 338.71557762 436 1725.794 None 0 None 1856 M MULI 060 2355 7153.08030928 7153.08007813 None 0 MULI MULTIPOLYGON (((-122.29736 37.88114, -122.2974...
1856 172.74327643 57831.625 172.74327643 1113 1126.78 None 0 None 1857 M MULI 054 1763 1228.491685 1228.49169922 None 0 MULI MULTIPOLYGON (((-122.28949 37.85835, -122.2893...
1857 157.757642818 63278.188 381.15307606 1532 1248.658 None 0 None 1858 NC C-SA 052 1527 335.41452172 6088.6953125 None 0 C MULTIPOLYGON (((-122.26951 37.84853, -122.2695...
1858 105.835958641 77694.688 445.48575821 1556 1456.261 None 0 None 1859 NC C-AC 052 1435 480.870974915 7457.67993164 None 0 C MULTIPOLYGON (((-122.27333 37.84618, -122.2733...
1859 129.49176163 28538.438 218.92572458 1525 718.455 None 0 None 1860 MDR R-2A 052 1533 996.79170825 2649.86743164 None 0 R MULTIPOLYGON (((-122.27278 37.84918, -122.2732...

1860 rows × 18 columns

In [ ]:
zones.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1860 entries, 0 to 1859
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   shape_st_2  1860 non-null   object  
 1   area        1860 non-null   object  
 2   shape_stle  1860 non-null   object  
 3   blocks_id   1860 non-null   object  
 4   perimeter   1860 non-null   object  
 5   lasteditor  0 non-null      object  
 6   height      1860 non-null   object  
 7   lastupdate  0 non-null      object  
 8   objectid    1860 non-null   object  
 9   genplan     1834 non-null   object  
 10  zoneclass   1860 non-null   object  
 11  new_block   1802 non-null   object  
 12  shape_st_1  1860 non-null   object  
 13  shape_star  1860 non-null   object  
 14  zonedesc    0 non-null      object  
 15  baseelev    1860 non-null   object  
 16  zone        1816 non-null   object  
 17  geometry    1860 non-null   geometry
dtypes: geometry(1), object(17)
memory usage: 261.7+ KB

Let's drop all null columns before we start analyzing the data.

In [ ]:
zones = zones.drop(columns=["zonedesc", "lastupdate", "lasteditor"])

Based on the specifications of GIS data, we know what shape_*, geometry, and area columns do. These are components used for defining and drawing the polygons we use for defining zones. Short descriptions below:

  • geometry: a column containing polygon objects encoded with all shape information (including the columns below)
  • shape_star: area of polygon (in meters)
  • shape_stle: perimeter of polygon (in meters)
  • shape_st_1: unclear... will require ArcGIS Pro
  • shape_st_2: unclear... will require ArcGis Pro

Because we know all the information is encoded in geometry, let's drop the cols with redundant information and put it in a new dataframe.

In [ ]:
blocks = zones.drop(columns=["shape_star", "shape_stle", "shape_st_1", "shape_st_2"])
blocks
Out[ ]:
area blocks_id perimeter height objectid genplan zoneclass new_block baseelev zone geometry
0 256240.562 3 2942.173 0 1 LDR R-1H 062 2949 0 R MULTIPOLYGON (((-122.27356 37.90460, -122.2735...
1 111175 5 1484.545 0 2 LDR R-1H 063 2951 0 R MULTIPOLYGON (((-122.26997 37.90433, -122.2699...
2 170184.812 6 2637.073 0 3 LDR R-1H 062 2942 0 R MULTIPOLYGON (((-122.27484 37.90388, -122.2745...
3 307217.938 7 3510.71 0 4 LDR R-1H 063 3100 0 R MULTIPOLYGON (((-122.26898 37.90397, -122.2684...
4 101674.438 8 1421.052 0 5 LDR R-1H 062 2945 0 R MULTIPOLYGON (((-122.27198 37.90388, -122.2719...
... ... ... ... ... ... ... ... ... ... ... ...
1855 157345.375 436 1725.794 0 1856 M MULI 060 2355 0 MULI MULTIPOLYGON (((-122.29736 37.88114, -122.2974...
1856 57831.625 1113 1126.78 0 1857 M MULI 054 1763 0 MULI MULTIPOLYGON (((-122.28949 37.85835, -122.2893...
1857 63278.188 1532 1248.658 0 1858 NC C-SA 052 1527 0 C MULTIPOLYGON (((-122.26951 37.84853, -122.2695...
1858 77694.688 1556 1456.261 0 1859 NC C-AC 052 1435 0 C MULTIPOLYGON (((-122.27333 37.84618, -122.2733...
1859 28538.438 1525 718.455 0 1860 MDR R-2A 052 1533 0 R MULTIPOLYGON (((-122.27278 37.84918, -122.2732...

1860 rows × 11 columns

Load in the visualization.

In [ ]:
blocks.explore()
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

This visualization has brought us some interesting insights:

  • height doesn't seem to mean anything... most of the height features are set to 0
  • same with base_elev. Looks like our dataset doesn't have any z-axis information
  • some of our blocks don't have perimeters or areas calculated. We should recompute them all with geopandas
  • block_id and new_block both indicate a change in how gov surveyors have enumerated zones.

Let's drop all the z-axis information columns and recompute areas and perimeters.

In [ ]:
blocks = blocks.drop(columns=["baseelev", "height"])
In [ ]:
blocks
Out[ ]:
area blocks_id perimeter objectid genplan zoneclass new_block zone geometry
0 256240.562 3 2942.173 1 LDR R-1H 062 2949 R MULTIPOLYGON (((-122.27356 37.90460, -122.2735...
1 111175 5 1484.545 2 LDR R-1H 063 2951 R MULTIPOLYGON (((-122.26997 37.90433, -122.2699...
2 170184.812 6 2637.073 3 LDR R-1H 062 2942 R MULTIPOLYGON (((-122.27484 37.90388, -122.2745...
3 307217.938 7 3510.71 4 LDR R-1H 063 3100 R MULTIPOLYGON (((-122.26898 37.90397, -122.2684...
4 101674.438 8 1421.052 5 LDR R-1H 062 2945 R MULTIPOLYGON (((-122.27198 37.90388, -122.2719...
... ... ... ... ... ... ... ... ... ...
1855 157345.375 436 1725.794 1856 M MULI 060 2355 MULI MULTIPOLYGON (((-122.29736 37.88114, -122.2974...
1856 57831.625 1113 1126.78 1857 M MULI 054 1763 MULI MULTIPOLYGON (((-122.28949 37.85835, -122.2893...
1857 63278.188 1532 1248.658 1858 NC C-SA 052 1527 C MULTIPOLYGON (((-122.26951 37.84853, -122.2695...
1858 77694.688 1556 1456.261 1859 NC C-AC 052 1435 C MULTIPOLYGON (((-122.27333 37.84618, -122.2733...
1859 28538.438 1525 718.455 1860 MDR R-2A 052 1533 R MULTIPOLYGON (((-122.27278 37.84918, -122.2732...

1860 rows × 9 columns

In [ ]:
blocks.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1860 entries, 0 to 1859
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   area       1860 non-null   object  
 1   blocks_id  1860 non-null   object  
 2   perimeter  1860 non-null   object  
 3   objectid   1860 non-null   object  
 4   genplan    1834 non-null   object  
 5   zoneclass  1860 non-null   object  
 6   new_block  1802 non-null   object  
 7   zone       1816 non-null   object  
 8   geometry   1860 non-null   geometry
dtypes: geometry(1), object(8)
memory usage: 130.9+ KB

With 1860 entries, we can see that some features (genplan, new_block, zone) are not fully populated. This can cause problems for us as we collate data from other sources, so it's really important we clean these issues now and impute any necessary information.

In [ ]:
blocks["genplan"].unique()
Out[ ]:
array(['LDR', 'BC', 'MDR', 'NC', 'X', 'HDR', 'M', 'MU', 'I', 'OS', 'W',
       'DT', 'RMU', None], dtype=object)
In [ ]:
len(blocks[blocks["genplan"].isna()])
Out[ ]:
26

Looks like we have 26 null values for this feature. The genplan identifiers correspond to codes outlined in Berkeley's Land Use Element of the published General Plan linked here.

A synopsis of land use classifications:

  • LDR (Low Density Residential) - These areas are generally characterized by single-family homes. Appropriate uses for these areas include:

residential, community services, schools, home occupations, recreational uses, and open spaces. Building intensity: 1-10 dwelling units per net acres, not including secondary units, population density: <=22 persons per acre.

  • MDR (Medium Density Residential) - These areas of Berkeley are generally characterized by a mix of single-family homes and small to

medium sized multi-family structures. Building intensity: 20-40 dwelling units per net acre, population density: 44-88 persons per acre.

  • HDR (High Density Residential) - In Berkeley, these areas are generally characterized by large, multi-family structures conveniently located

near transit, the Downtown, the University campus, or BART. Building intensity: 40-100 dwelling units per net acre, population density: 88-220 persons per net acre.

  • RMU (Residential Mixed Use) - These areas are generally characterized by a diverse mixture of residential, commercial and institutional

structures, in close proximity to transit and major shopping and employment centers. Building intensity: 40-100 dwelling units per net acre, population density: 88-220 persons per net acre.

  • NC (Neighborhood Commercial) - These areas of the city are generally characterized by pedestrian-oriented, neighborhood-serving

commercial development, and multi-family residential structures. Population density: 44-88 persons per acre.

  • AC (Avenue Commercial) - These areas of Berkeley are characterized by pedestrian-oriented commercial development and multi-

family residential structures. Building intensity will generally range from a Floor Area Ratio (FAR) of less than 1 to an FAR of 5. Population density: 44-88 persons per acre

  • DT (Downtown Mixed-Use) - The-Downtown Area contains areas characterized by high density commercial, office, arts, culture, and entertainment and residential development, which are designated as Downtown Mixed-Use. Diverse uses and the "highest building intensity"
  • I (Institutional) - These are areas of Berkeley for institutional, government, educational, recreational, open space, natural habitat, woodlands, and public service uses and facilities, such as the University of California, BART, Berkeley Unified School District, and East Bay Municipal Utility District facilities. Within these areas, building intensity will generally range from a Floor Area Ratio (FAR) of less than 1 to an FAR of 4.
  • M (Manufacturing) - These areas are intended to maintain and preserve areas of Berkeley for manufacturing and industrial uses necessary for a multi-faceted economy and job growth. Within these areas, building intensity will generally range from a Floor Area Ratio (FAR) of less than 1 to an FAR of 2.
  • MU (Mixed-Use) - These areas are intended to maintain and preserve areas of the city for lighter manufacturing and industrial uses and allow for additional uses, including residential, where determined appropriate by zoning, and only if the use will not weaken Berkeley’s manufacturing and industrial economy. Appropriate uses for these areas are identified in the West Berkeley Plan. intensity will generally range from a Floor Area Ratio (FAR) of less than 1 to an FAR of 1.5. Population

density: 22-44 persons per acre, where housing is allowed.

  • W (Waterfront/Marina) - These areas are intended to maintain and preserve areas of Berkeley adjacent to the Bay for open space, recreational uses, waterfront-related commercial and visitor services, boating, and water transit facilities.
  • OS (Open Space) - These areas of the city are appropriate for parks, open space, pathways, recreational facilities, natural habitat, and woodlands.

Land use classifications also have corresponding compatible zoning district types:

  • LDR -> R-1A , R-2
  • MDR -> R-2A, R-3
  • HDR -> R-4, R-5, R-S
  • RMU -> R SMU
  • CN -> C-N, C-E, C-NS, C-SO, C-SA
  • AC -> C-SA, C-1, C-T, C-W
  • DT -> R-2A, R-3, R04
  • I -> None
  • M -> M, MM, MU-LI
  • MUR -> MU-R
In [ ]:
blocks[blocks["genplan"].isna()]
Out[ ]:
area blocks_id perimeter objectid genplan zoneclass new_block zone geometry
1682 0 0 0 1847 None C-DMU Corr None None MULTIPOLYGON (((-122.26730 37.86573, -122.2669...
1722 65.443 0 281.056 1719 None R-2 None None MULTIPOLYGON (((-122.28377 37.86140, -122.2837...
1724 146.194 0 250.698 1721 None R-2 None None MULTIPOLYGON (((-122.28400 37.86174, -122.2839...
1725 2.341 0 31.723 1722 None R-2 None None MULTIPOLYGON (((-122.28377 37.86140, -122.2837...
1781 0 0 0 1848 None C-DMU Buff None None MULTIPOLYGON (((-122.26692 37.86578, -122.2667...
1810 0 0 0 1849 None C-DMU Corr None None MULTIPOLYGON (((-122.26740 37.86661, -122.2670...
1811 0 0 0 1850 None C-DMU Buff None None MULTIPOLYGON (((-122.26701 37.86666, -122.2666...
1812 0 0 0 1851 None C-DMU Buff None None MULTIPOLYGON (((-122.26593 37.86702, -122.2662...
1834 0 0 0 1830 None C-DMU Buff None None MULTIPOLYGON (((-122.26772 37.87387, -122.2672...
1835 0 0 0 1831 None C-DMU Corr None None MULTIPOLYGON (((-122.26840 37.87378, -122.2677...
1836 0 0 0 1832 None C-DMU Oute None None MULTIPOLYGON (((-122.26837 37.87349, -122.2671...
1837 0 0 0 1833 None C-DMU Buff None None MULTIPOLYGON (((-122.26949 37.87293, -122.2695...
1838 0 0 0 1834 None C-DMU Corr None None MULTIPOLYGON (((-122.26873 37.87378, -122.2686...
1839 0 0 0 1835 None R-2A None None MULTIPOLYGON (((-122.26964 37.87367, -122.2690...
1840 0 0 0 1836 None C-DMU Corr None None MULTIPOLYGON (((-122.26884 37.86791, -122.2689...
1841 0 0 0 1837 None C-DMU Buff None None MULTIPOLYGON (((-122.26892 37.86790, -122.2688...
1842 0 0 0 1838 None C-DMU Corr None None MULTIPOLYGON (((-122.26864 37.86740, -122.2680...
1843 0 0 0 1839 None C-DMU Buff None None MULTIPOLYGON (((-122.26840 37.86706, -122.2683...
1844 0 0 0 1840 None C-DMU Corr None None MULTIPOLYGON (((-122.26793 37.86654, -122.2678...
1845 0 0 0 1841 None C-DMU Buff None None MULTIPOLYGON (((-122.26835 37.86649, -122.2682...
1846 0 0 0 1842 None R-3 None None MULTIPOLYGON (((-122.26915 37.86637, -122.2691...
1847 0 0 0 1843 None R-3 None None MULTIPOLYGON (((-122.26988 37.86539, -122.2686...
1848 0 0 0 1844 None C-DMU Buff None None MULTIPOLYGON (((-122.26868 37.86555, -122.2684...
1849 0 0 0 1845 None C-DMU Corr None None MULTIPOLYGON (((-122.26825 37.86561, -122.2678...
1850 0 0 0 1846 None C-SA None None MULTIPOLYGON (((-122.26572 37.86426, -122.2659...
1851 0 0 0 1852 None C-DMU Oute None None MULTIPOLYGON (((-122.26742 37.86683, -122.2675...

Most of the areas with no gen plans are designated "C-DMU". These are Commercial Downtown Mixed Use areas. Looking at the visualization, it seems these are institutional buildings with commercial centers (ex: Berkeley Way West). For our use case, it's appropriate to label these as downtown (DT), since the genplan issue seems to stem from the lack of a name for a insitution/commercial hybrid structure.

In [ ]:
indexes = blocks.loc[blocks["genplan"].isna() & blocks["zoneclass"].str.startswith("C-DMU")].index
blocks.loc[indexes, "genplan"] = "DT"
In [ ]:
blocks[blocks["genplan"].isna()]
Out[ ]:
area blocks_id perimeter objectid genplan zoneclass new_block zone geometry
1722 65.443 0 281.056 1719 None R-2 None None MULTIPOLYGON (((-122.28377 37.86140, -122.2837...
1724 146.194 0 250.698 1721 None R-2 None None MULTIPOLYGON (((-122.28400 37.86174, -122.2839...
1725 2.341 0 31.723 1722 None R-2 None None MULTIPOLYGON (((-122.28377 37.86140, -122.2837...
1839 0 0 0 1835 None R-2A None None MULTIPOLYGON (((-122.26964 37.87367, -122.2690...
1846 0 0 0 1842 None R-3 None None MULTIPOLYGON (((-122.26915 37.86637, -122.2691...
1847 0 0 0 1843 None R-3 None None MULTIPOLYGON (((-122.26988 37.86539, -122.2686...
1850 0 0 0 1846 None C-SA None None MULTIPOLYGON (((-122.26572 37.86426, -122.2659...

These are our last few blocks without genplans. Let's put them into a df and explore to figure out what's up.

In [ ]:
outliers = blocks[blocks["genplan"].isna()]
outliers.explore()
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

One of these objects (id: 1725) is a line with no data (potentially a bad input). We'll remove that one from our dataset. The others seem to be directly adjacent to special non-residential structures (open spaces, stores, fire and police stations). In all cases they are residential areas, and therefore should be given a residential zone code. We'll assign Residential Mixed Use (RMU) gen codes to these spaces.

In [ ]:
blocks.drop(index=1725)
outliers.drop(index=1725)
blocks.loc[outliers.index, "genplan"] = "RMU"
In [ ]:
blocks.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1860 entries, 0 to 1859
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   area       1860 non-null   object  
 1   blocks_id  1860 non-null   object  
 2   perimeter  1860 non-null   object  
 3   objectid   1860 non-null   object  
 4   genplan    1860 non-null   object  
 5   zoneclass  1860 non-null   object  
 6   new_block  1802 non-null   object  
 7   zone       1816 non-null   object  
 8   geometry   1860 non-null   geometry
dtypes: geometry(1), object(8)
memory usage: 130.9+ KB

Having blocks without new_block assignments is ok. We'll use blocks_id as our primary key for blocks, and maintain new_block feature for when we collate this data with other sets that may only reference areas with new_block ids.

In [ ]:
blocks["zone"].unique()
Out[ ]:
array(['R', 'C', None, 'MM', 'MUR', 'MULI', 'M', 'U', 'X', 'SP', 'C-W'],
      dtype=object)
In [ ]:
nozones = blocks[blocks["zone"].isna()]
nozones.explore()
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Same deal here with zones. It seems like areas with mixed usage have not been labeled correctly in certain areas. Again, for our use cases it's best to label them ourselves with the appropriate label according to the outlined standards.

In [ ]:
blocks.loc[nozones["zoneclass"].index, "zone"] = "MUR"
In [ ]:
blocks.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1860 entries, 0 to 1859
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   area       1860 non-null   object  
 1   blocks_id  1860 non-null   object  
 2   perimeter  1860 non-null   object  
 3   objectid   1860 non-null   object  
 4   genplan    1860 non-null   object  
 5   zoneclass  1860 non-null   object  
 6   new_block  1802 non-null   object  
 7   zone       1860 non-null   object  
 8   geometry   1860 non-null   geometry
dtypes: geometry(1), object(8)
memory usage: 130.9+ KB

Awesome! Our dataset is clean and interpretable. We can now integrate other datasets and do some analysis. We'll start by integrating information about service inquiries to the city via 311 calls.

In [ ]:
inquiries = makeDataFrame("raw/311.csv")
inquiries
Out[ ]:
Case_ID Date_Opened Case_Status Date_Closed Request_Category Request_SubCategory Request_Detail Object_Type APN Street_Address City State Neighborhood Latitude Longitude Location
0 121000877593 09/16/2021 06:23:23 AM Closed 09/20/2021 11:22:22 AM Facilities, Electrical & Property Management Parks/Marina Building Services Keys / Locks Property NaN Intersection of Browning and Addison, BERKELEY... Berkeley CA Berkeley NaN NaN NaN
1 121000876647 09/13/2021 10:50:00 AM Open NaN Refuse and Recycling Residential Residential Bulky Pickup Property 054 180702800 1722 DWIGHT WAY Berkeley CA Berkeley 37.862656 -122.275461 (37.86265624, -122.27546088)
2 121000809740 11/06/2020 04:51:00 PM Closed 11/09/2020 01:52:57 AM General Questions/information Miscellaneous Miscellaneous Service Request Individual NaN NaN Berkeley CA Berkeley NaN NaN NaN
3 121000809739 11/06/2020 04:38:00 PM Closed 11/09/2020 01:41:12 AM General Questions/information Miscellaneous Miscellaneous Service Request Property 060 249305600 1411 GRIZZLY PEAK BLVD Berkeley CA Berkeley 37.884799 -122.247874 (37.88479918, -122.24787412)
4 121000793663 09/01/2020 11:32:00 AM Closed 09/01/2020 11:36:00 AM Other Account Services and Billing Marina Payment Collection - Marina Individual NaN NaN Berkeley CA Berkeley NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
654695 121000971475 12/04/2022 12:21:27 PM Open NaN General Questions/information Miscellaneous Miscellaneous Internet Request Property 057 208701100 1029 HEARST AVE Berkeley CA Berkeley 37.870447 -122.293627 (37.87044703, -122.29362715)
654696 121000971482 12/04/2022 04:16:35 PM Open NaN General Questions/information Miscellaneous Miscellaneous Internet Request Unknown NaN NaN Berkeley CA Berkeley NaN NaN NaN
654697 121000971476 12/04/2022 12:53:20 PM Open NaN General Questions/information Miscellaneous Miscellaneous Internet Request Unknown NaN NaN Berkeley CA Berkeley NaN NaN NaN
654698 121000971481 12/04/2022 04:04:51 PM Open NaN Streets, Utilities, and Transportation Clean City Program Illegal Dumping - Internet Request Property 052 152201100 1615 SIXTY-THIRD ST Berkeley CA Berkeley 37.847632 -122.275050 (37.84763248, -122.27505004)
654699 121000971478 12/04/2022 01:35:38 PM Open NaN General Questions/information Miscellaneous Miscellaneous Internet Request Unknown NaN NaN Berkeley CA Berkeley NaN NaN NaN

654700 rows × 16 columns

In [ ]:
inquiries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654700 entries, 0 to 654699
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Case_ID              654700 non-null  int64  
 1   Date_Opened          654700 non-null  object 
 2   Case_Status          654700 non-null  object 
 3   Date_Closed          621774 non-null  object 
 4   Request_Category     654700 non-null  object 
 5   Request_SubCategory  654700 non-null  object 
 6   Request_Detail       654700 non-null  object 
 7   Object_Type          654700 non-null  object 
 8   APN                  375370 non-null  object 
 9   Street_Address       413262 non-null  object 
 10  City                 654700 non-null  object 
 11  State                654700 non-null  object 
 12  Neighborhood         654700 non-null  object 
 13  Latitude             370569 non-null  float64
 14  Longitude            370569 non-null  float64
 15  Location             370569 non-null  object 
dtypes: float64(2), int64(1), object(13)
memory usage: 79.9+ MB
In [ ]:
inquiries["Neighborhood"].unique()
Out[ ]:
array(['Berkeley'], dtype=object)

Looks like only ~50% of the records have specific location data. Let's filter these out.

In [ ]:
inquiries.drop(index=inquiries[inquiries["Latitude"].isna()].index, inplace=True)
In [ ]:
# We only want unique cases
inquiries = inquiries.groupby('Case_ID', as_index=False).first()
inquiries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370562 entries, 0 to 370561
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Case_ID              370562 non-null  int64  
 1   Date_Opened          370562 non-null  object 
 2   Case_Status          370562 non-null  object 
 3   Date_Closed          342846 non-null  object 
 4   Request_Category     370562 non-null  object 
 5   Request_SubCategory  370562 non-null  object 
 6   Request_Detail       370562 non-null  object 
 7   Object_Type          370562 non-null  object 
 8   APN                  370562 non-null  object 
 9   Street_Address       370562 non-null  object 
 10  City                 370562 non-null  object 
 11  State                370562 non-null  object 
 12  Neighborhood         370562 non-null  object 
 13  Latitude             370562 non-null  float64
 14  Longitude            370562 non-null  float64
 15  Location             370562 non-null  object 
dtypes: float64(2), int64(1), object(13)
memory usage: 45.2+ MB

We'll need a method to join a dataframe with our blocks data by latitude and longitude. We'll then add 2 new features to the dataset:

  • Inquiries - # of requests made to 311 for services
  • InquiryCategories - a list of unique request categories made
In [ ]:
def makeGeoDataFrame(df):
    """
    Given a pandas dataframe with columns=["Latitude", "Longitude"],
    return a GeoDataFrame with point geometry
    """
    gdf = geopandas.GeoDataFrame(
        df, geometry=geopandas.points_from_xy(df["Longitude"], df["Latitude"]), crs=espg4326
    )
    return gdf

def joinOnCoordinate(polys, points):
    """
    Given a GeoDataframe with polygons and 
    a GeoDataframe with point geometries, return 
    a GeoDataframe of the join.
    """
    points.crs = polys.crs
    joined = points.sjoin_nearest(polys, how="right")
    return joined
In [ ]:
inquiries = inquiries.loc[:, ["Request_Category", "APN", "Latitude", "Longitude"]]
counts = inquiries.groupby(["Latitude", "Longitude"]).count()["Request_Category"]
requestCategories = inquiries.groupby(["Latitude", "Longitude"]).agg(lambda x: ','.join(x.unique()))["Request_Category"]
inquiries = inquiries.groupby(["Latitude", "Longitude"]).first()
inquiries["serviceRequests"] = counts
inquiries["requestCategories"] = requestCategories
inquiries = inquiries.reset_index()
inquiries
Out[ ]:
Latitude Longitude Request_Category APN serviceRequests requestCategories
0 37.845970 -122.275596 Refuse and Recycling 052 143600102 6 Refuse and Recycling
1 37.846022 -122.275217 Refuse and Recycling 052 143501200 8 Refuse and Recycling,General Questions/informa...
2 37.846045 -122.275786 Refuse and Recycling 052 143600200 20 Refuse and Recycling,Streets, Utilities, and T...
3 37.846141 -122.275065 Streets, Utilities, and Transportation 052 143501100 5 Streets, Utilities, and Transportation,Refuse ...
4 37.846147 -122.275633 Refuse and Recycling 052 143600101 14 Refuse and Recycling,Streets, Utilities, and T...
... ... ... ... ... ... ...
24900 37.904624 -122.272952 Refuse and Recycling 062 294603001 5 Refuse and Recycling,Streets, Utilities, and T...
24901 37.904645 -122.273908 Refuse and Recycling 062 294900200 5 Refuse and Recycling,General Questions/informa...
24902 37.904723 -122.273179 Refuse and Recycling 062 294603200 3 Refuse and Recycling
24903 37.904787 -122.273322 Streets, Utilities, and Transportation 062 294603300 3 Streets, Utilities, and Transportation,Refuse ...
24904 37.904851 -122.273458 Refuse and Recycling 062 294603400 11 Refuse and Recycling,Parks, Trees and Vegetati...

24905 rows × 6 columns

In [ ]:
inquiries = makeGeoDataFrame(inquiries)
new_blocks = joinOnCoordinate(blocks, inquiries)
new_blocks
/Users/atharvashirke/pcs2022/doublespeak/.venv/lib/python3.10/site-packages/geopandas/array.py:344: UserWarning: Geometry is in a geographic CRS. Results from 'sjoin_nearest' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  warnings.warn(
Out[ ]:
index_left Latitude Longitude Request_Category APN serviceRequests requestCategories area blocks_id perimeter objectid genplan zoneclass new_block zone geometry
0 24771 37.903241 -122.274976 Refuse and Recycling 062 294902700 11 Refuse and Recycling 256240.562 3 2942.173 1 LDR R-1H 062 2949 R MULTIPOLYGON (((-122.27356 37.90460, -122.2735...
1 24888 37.904229 -122.269848 Refuse and Recycling 063 295102304 9 Refuse and Recycling,General Questions/informa... 111175 5 1484.545 2 LDR R-1H 063 2951 R MULTIPOLYGON (((-122.26997 37.90433, -122.2699...
2 24749 37.903118 -122.277698 Refuse and Recycling 062 294205500 10 Refuse and Recycling,General Questions/informa... 170184.812 6 2637.073 3 LDR R-1H 062 2942 R MULTIPOLYGON (((-122.27484 37.90388, -122.2745...
3 24461 37.901399 -122.266383 Traffic and Transportation 063 310002500 5 Traffic and Transportation,Refuse and Recyclin... 307217.938 7 3510.71 4 LDR R-1H 063 3100 R MULTIPOLYGON (((-122.26898 37.90397, -122.2684...
4 24671 37.902659 -122.272248 Refuse and Recycling 062 294503200 8 Refuse and Recycling,General Questions/informa... 101674.438 8 1421.052 5 LDR R-1H 062 2945 R MULTIPOLYGON (((-122.27198 37.90388, -122.2719...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1855 17037 37.881194 -122.297565 Government Activity 060 235500804 10 Government Activity,Streets, Utilities, and Tr... 157345.375 436 1725.794 1856 M MULI 060 2355 MULI MULTIPOLYGON (((-122.29736 37.88114, -122.2974...
1856 5029 37.858108 -122.289271 General Questions/information 054 176300800 14 General Questions/information,Refuse and Recyc... 57831.625 1113 1126.78 1857 M MULI 054 1763 MULI MULTIPOLYGON (((-122.28949 37.85835, -122.2893...
1857 325 37.848776 -122.269204 Streets, Utilities, and Transportation 052 152700103 2 Streets, Utilities, and Transportation 63278.188 1532 1248.658 1858 NC C-SA 052 1527 C MULTIPOLYGON (((-122.26951 37.84853, -122.2695...
1858 16 37.846375 -122.273335 Refuse and Recycling 052 143500102 68 Refuse and Recycling,Traffic and Transportatio... 77694.688 1556 1456.261 1859 NC C-AC 052 1435 C MULTIPOLYGON (((-122.27333 37.84618, -122.2733...
1859 401 37.849198 -122.273006 Environmental Services and Programs 052 153300400 21 Environmental Services and Programs,Refuse and... 28538.438 1525 718.455 1860 MDR R-2A 052 1533 R MULTIPOLYGON (((-122.27278 37.84918, -122.2732...

1860 rows × 16 columns

In [ ]:
new_blocks = new_blocks.drop(columns=["Latitude", "Longitude", "index_left", "Request_Category"])
new_blocks = new_blocks.rename(columns={"requestCategories":"inquiryCategories", "serviceRequests":"inquiries"})
new_blocks.explore()
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
len(blocks) == len(new_blocks)
Out[ ]:
True

The range of service request values is really large. We'll add a log feature for better visualization.

In [ ]:
new_blocks["ln_inquiries"] = np.log(new_blocks.inquiries)
In [ ]:
new_blocks.explore(
    column="ln_inquiries",
    scheme="Quantiles",
    tooltip="inquiries",
    k = 20,
    popup=True,
)
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Chloropleth visualization of service requests in Berkeley by block.

Next we'll try to integrate some public safety data. We'll start with Berkeley City Auditor Police Response dataset. This rich dataset should provide us information regarding police responses from 2015-2019 fiscal years.

In [ ]:
police = makeDataFrame("raw/pdResponse.csv")
police
/var/folders/mn/xk12q_t57y1889l8g92dyp8m0000gn/T/ipykernel_25295/845972379.py:9: DtypeWarning: Columns (9,13) have mixed types. Specify dtype option on import or set low_memory=False.
  return pd.read_csv(targetPath)
Out[ ]:
CFSID IncidentNumber Address Lat Lon Address Location Type POLICE AREA CallType CallType Description Priority ... Call Disposition UNIT DISPOSITION Occurred Incident Type UCR Return A Code PrimaryUnitFlag UnitNumber DispatchTime EnRouteTime OnSceneTime ClearTime
0 -683358 2015-00000001 2200 BLOCK CHANNING WAY BERKELEY 94704 37.866528 -122.263839 Address 6W 1042 CFS 1042 - Welfare Check 2 ... MDT Narrative Only NaN NaN NaN 0 S4 01/01/2015 12:14:52 AM NaN 2015-01-01T00:15:02.000 01/01/2015 12:19:38 AM
1 -683358 2015-00000001 2200 BLOCK CHANNING WAY BERKELEY 94704 37.866528 -122.263839 Address 6W 1042 CFS 1042 - Welfare Check 2 ... MDT Narrative Only NaN NaN NaN 1 3A6 01/01/2015 12:14:49 AM NaN 2015-01-01T00:15:04.000 01/01/2015 12:43:03 AM
2 -683359 2015-00000007 2000 BLOCK SHATTUCK AVE BERKELEY 94704 37.871559 -122.268679 Address 4E 415E CFS 415E - Noise Disturbance 4 ... MDT Narrative Only NaN NaN NaN 1 S31 01/01/2015 12:43:54 AM 2015-01-01T00:44:40.000 2015-01-01T00:45:51.000 01/01/2015 12:49:32 AM
3 -683364 2015-00000006 2400 BLOCK GRANT ST BERKELEY 94703 37.864138 -122.274771 Address 5W AID CFS AID - Aid to Citizen 4 ... MDT Narrative Only NaN NaN NaN 1 S31 01/01/2015 12:42:13 AM 2015-01-01T00:42:15.000 NaN 01/01/2015 12:42:54 AM
4 -683367 2015-00000014 2200 BLOCK GRANT ST BERKELEY 94703 37.867370 -122.275101 Address 5W 415E CFS 415E - Noise Disturbance 4 ... MDT Narrative Only NaN NaN NaN 1 4A5 01/01/2015 01:38:10 AM 2015-01-01T01:38:12.000 2015-01-01T01:39:45.000 01/01/2015 01:40:53 AM
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
750107 -1333694 2020-00045341 SAN PABLO AVE BERKELEY 94702 37.873443 -122.293531 Intersection 15S T CFS T - Traffic Stop 0 ... RIPA NaN NaN NaN 0 4A13 09/30/2020 11:53:25 PM 2020-09-30T23:54:25.000 NaN 09/30/2020 11:56:02 PM
750108 -1333694 2020-00045341 SAN PABLO AVE BERKELEY 94702 37.873443 -122.293531 Intersection 15S T CFS T - Traffic Stop 0 ... RIPA NaN NaN NaN 1 4B9 09/30/2020 11:53:21 PM 2020-09-30T23:53:21.000 2020-09-30T23:53:21.000 09/30/2020 11:56:14 PM
750109 -1333695 2020-00045342 8TH ST BERKELEY 94710 37.865704 -122.294634 Intersection 14S 1194 CFS 1194 - Pedestrian Stop 0 ... NaN RIPA NaN NaN 0 4A3 09/30/2020 11:59:23 PM 2020-09-30T23:59:29.000 2020-09-30T23:59:29.000 10/01/2020 12:12:32 AM
750110 -1333695 2020-00045342 8TH ST BERKELEY 94710 37.865704 -122.294634 Intersection 14S 1194 CFS 1194 - Pedestrian Stop 0 ... NaN RIPA NaN NaN 0 4A13 09/30/2020 11:58:21 PM 2020-09-30T23:58:25.000 2020-10-01T00:15:20.000 10/01/2020 12:31:54 AM
750111 -1333695 2020-00045342 8TH ST BERKELEY 94710 37.865704 -122.294634 Intersection 14S 1194 CFS 1194 - Pedestrian Stop 0 ... NaN RIPA NaN NaN 1 4B9 09/30/2020 11:58:14 PM 2020-09-30T23:58:14.000 2020-09-30T23:58:14.000 10/01/2020 12:17:10 AM

750112 rows × 22 columns

In [ ]:
police = police.loc[:, ["Lat", "Lon", "CallType Description", "Occurred Incident Type"]]
police
Out[ ]:
Lat Lon CallType Description Occurred Incident Type
0 37.866528 -122.263839 CFS 1042 - Welfare Check NaN
1 37.866528 -122.263839 CFS 1042 - Welfare Check NaN
2 37.871559 -122.268679 CFS 415E - Noise Disturbance NaN
3 37.864138 -122.274771 CFS AID - Aid to Citizen NaN
4 37.867370 -122.275101 CFS 415E - Noise Disturbance NaN
... ... ... ... ...
750107 37.873443 -122.293531 CFS T - Traffic Stop NaN
750108 37.873443 -122.293531 CFS T - Traffic Stop NaN
750109 37.865704 -122.294634 CFS 1194 - Pedestrian Stop NaN
750110 37.865704 -122.294634 CFS 1194 - Pedestrian Stop NaN
750111 37.865704 -122.294634 CFS 1194 - Pedestrian Stop NaN

750112 rows × 4 columns

In [ ]:
police.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750112 entries, 0 to 750111
Data columns (total 4 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Lat                     750112 non-null  float64
 1   Lon                     750112 non-null  float64
 2   CallType Description    750061 non-null  object 
 3   Occurred Incident Type  155651 non-null  object 
dtypes: float64(2), object(2)
memory usage: 22.9+ MB

Looks like a very small portion of this dataset has confirmed incidents. Let's filter out calls that don't have confirmed incidents. and just have a dataframe with coordinates and occured crimes.

In [ ]:
police = police[~police["Occurred Incident Type"].isna()].loc[:, ["Lat", "Lon", "Occurred Incident Type"]]
police = police.rename(columns={"Lat":"Latitude", "Lon":"Longitude"})
counts = police.groupby(["Latitude", "Longitude"]).count()["Occurred Incident Type"]
incidentTypes = police.groupby(["Latitude", "Longitude"]).agg(lambda x: ','.join(x.unique()))["Occurred Incident Type"]
police = police.groupby(["Latitude", "Longitude"]).first()
police["confirmedCases"] = counts
police["incidentTypes"] = incidentTypes
police = police.drop(columns=["Occurred Incident Type"])
police = police.reset_index()
police
Out[ ]:
Latitude Longitude confirmedCases incidentTypes
0 -361.000000 -361.000000 1147 Narcotics Violation,Fraud/Forgery,Vehicle Code...
1 37.634523 -122.112417 2 Fraud/Forgery
2 37.660245 -122.091605 2 Narcotics Violation
3 37.668383 -122.087013 21 Narcotics Violation,Fraud/Forgery
4 37.675897 -121.779499 1 Identity Theft
... ... ... ... ...
12655 37.904305 -122.269621 8 Vehicle Stolen,Burglary Commercial
12656 37.904328 -122.273225 10 Burglary Residential
12657 37.904358 -122.270274 5 Vandalism,Burglary Commercial
12658 37.904620 -122.272948 2 Vandalism,Vehicle Stolen
12659 37.904875 -122.273458 11 Theft from Auto,Vandalism

12660 rows × 4 columns

Item 0 is clearly a null value they place confirmed cases without a location. We should drop this value.

In [ ]:
police = police.drop(index=0)
police = makeGeoDataFrame(police)
newer_blocks = joinOnCoordinate(new_blocks, police)
/Users/atharvashirke/pcs2022/doublespeak/.venv/lib/python3.10/site-packages/geopandas/array.py:344: UserWarning: Geometry is in a geographic CRS. Results from 'sjoin_nearest' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  warnings.warn(
In [ ]:
newer_blocks
Out[ ]:
index_left Latitude Longitude confirmedCases incidentTypes APN inquiries inquiryCategories area blocks_id perimeter objectid genplan zoneclass new_block zone geometry ln_inquiries
0 12656 37.904328 -122.273225 10 Burglary Residential 062 294902700 11 Refuse and Recycling 256240.562 3 2942.173 1 LDR R-1H 062 2949 R MULTIPOLYGON (((-122.27356 37.90460, -122.2735... 2.397895
1 12617 37.902783 -122.269852 4 Death Investigation 063 295102304 9 Refuse and Recycling,General Questions/informa... 111175 5 1484.545 2 LDR R-1H 063 2951 R MULTIPOLYGON (((-122.26997 37.90433, -122.2699... 2.197225
2 12618 37.902790 -122.275249 1 Identity Theft 062 294205500 10 Refuse and Recycling,General Questions/informa... 170184.812 6 2637.073 3 LDR R-1H 062 2942 R MULTIPOLYGON (((-122.27484 37.90388, -122.2745... 2.302585
3 12613 37.902589 -122.266585 1 Vehicle Stolen 063 310002500 5 Traffic and Transportation,Refuse and Recyclin... 307217.938 7 3510.71 4 LDR R-1H 063 3100 R MULTIPOLYGON (((-122.26898 37.90397, -122.2684... 1.609438
4 12620 37.902830 -122.272560 1 Vehicle Stolen 062 294503200 8 Refuse and Recycling,General Questions/informa... 101674.438 8 1421.052 5 LDR R-1H 062 2945 R MULTIPOLYGON (((-122.27198 37.90388, -122.2719... 2.079442
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1855 10490 37.881783 -122.297754 36 Vandalism,Burglary Commercial,Robbery 060 235500804 10 Government Activity,Streets, Utilities, and Tr... 157345.375 436 1725.794 1856 M MULI 060 2355 MULI MULTIPOLYGON (((-122.29736 37.88114, -122.2974... 2.302585
1856 3117 37.858415 -122.289523 5 Vandalism 054 176300800 14 General Questions/information,Refuse and Recyc... 57831.625 1113 1126.78 1857 M MULI 054 1763 MULI MULTIPOLYGON (((-122.28949 37.85835, -122.2893... 2.639057
1857 374 37.848766 -122.269190 10 Narcotics Violation 052 152700103 2 Streets, Utilities, and Transportation 63278.188 1532 1248.658 1858 NC C-SA 052 1527 C MULTIPOLYGON (((-122.26951 37.84853, -122.2695... 0.693147
1858 131 37.846396 -122.273312 20 Disturbance,Theft Misdemeanor (Under $950),Dom... 052 143500102 68 Refuse and Recycling,Traffic and Transportatio... 77694.688 1556 1456.261 1859 NC C-AC 052 1435 C MULTIPOLYGON (((-122.27333 37.84618, -122.2733... 4.219508
1859 445 37.849303 -122.273144 2 Assault/Battery Misdemeanor 052 153300400 21 Environmental Services and Programs,Refuse and... 28538.438 1525 718.455 1860 MDR R-2A 052 1533 R MULTIPOLYGON (((-122.27278 37.84918, -122.2732... 3.044522

1860 rows × 18 columns

In [ ]:
newer_blocks.explore(
    column="confirmedCases",
    scheme="Quantiles",
    tooltip="confirmedCases",
    k = 20,
    popup=True,
)
/Users/atharvashirke/pcs2022/doublespeak/.venv/lib/python3.10/site-packages/mapclassify/classifiers.py:238: UserWarning: Warning: Not enough unique values in array to form k classes
  Warn(
/Users/atharvashirke/pcs2022/doublespeak/.venv/lib/python3.10/site-packages/mapclassify/classifiers.py:241: UserWarning: Warning: setting k to 11
  Warn("Warning: setting k to %d" % k_q, UserWarning)
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Chloropleth map of police response activity to confirmed cases.

In [ ]: